import pandas as pd

# Load the cleaned dataset (household-year level) - updated to task5 version
df = pd.read_csv('cleaned_with_male_head.csv')

# Group by province to get number of observations (household-years)
prov_obs = df.groupby('province').size().reset_index(name='num_households')

# Compute (%) for number of households (observations)
total_households = prov_obs['num_households'].sum()
prov_obs['pct'] = (prov_obs['num_households'] / total_households * 100).round(2)

# NEW: Compute male-headed % dynamically from 'male_head' column
male_headed = df.groupby('province')['male_head'].mean() * 100
prov_obs['male_headed_pct'] = prov_obs['province'].map(male_headed.round(2).to_dict())

# Region mapping from paper
region_map = {
    'Ha Tay': 'Red River Delta',
    'Lao Cai': 'Northern Midlands and Mountainous Area',
    'Phu Tho': 'Northern Midlands and Mountainous Area',
    'Lai Chau': 'Northern Midlands and Mountainous Area',
    'Dien Bien': 'Northern Midlands and Mountainous Area',
    'Nghe An': 'North Central and Central Coastal Area',
    'Quang Nam': 'North Central and Central Coastal Area',
    'Khanh Hoa': 'North Central and Central Coastal Area',
    'Dak Lak': 'Central Highlands',
    'Dak Nong': 'Central Highlands',
    'Lam Dong': 'Central Highlands',
    'Long An': 'Mekong River Delta'
}

prov_table = prov_obs.copy()
prov_table['Region'] = prov_table['province'].map(region_map)

# Sort by region order as in paper
region_order = {
    'Red River Delta': 1,
    'Northern Midlands and Mountainous Area': 2,
    'North Central and Central Coastal Area': 3,
    'Central Highlands': 4,
    'Mekong River Delta': 5
}
prov_table['region_sort'] = prov_table['Region'].map(region_order)
prov_table = prov_table.sort_values(['region_sort', 'province']).drop('region_sort', axis=1)

# Add total row
total_male_pct = (prov_table['male_headed_pct'] * prov_table['num_households']).sum() / total_households
total_row = pd.DataFrame({
    'Region': ['Total'],
    'province': [''],
    'num_households': [total_households],
    'pct': [100.00],
    'male_headed_pct': [round(total_male_pct, 2)]
}, index=[len(prov_table)])

table8 = pd.concat([prov_table, total_row]).reset_index(drop=True)

# Select and rename columns to match paper
table8 = table8[['Region', 'province', 'num_households', 'pct', 'male_headed_pct']]
table8.columns = ['Region', 'Province', 'Number of households', '(%)', 'Male-headed households (%)']

# Adjust province name for Ha Tay as in paper
table8['Province'] = table8['Province'].replace('Ha Tay', 'Ha Tay (Hanoi)')

# Print and save
print("\nTable 8: Statistical characteristics of households in 12 provinces and socio-economic regions\n")
print(table8)
table8.to_csv('revise_table_8_household_stats.csv', index=False)
print("Saved table8_household_stats.csv")
print("\nNote: Male-headed % computed dynamically from 'male_head' column.")